#### Apply DLE tests to Alvarez et al (2019) ####

#### Packages and global options ####
library(tidyverse) # data wrangling
library(here) # relative path management
library(data.table) # fast data.frame operations
library(DeclareDesign) # design-based estimators
library(ggh4x) # nested facets in ggplot
library(coin) # conditional independence tests
# devtools::install_github("li-xinran/RIQITE")
library(RIQITE) ##coin wrapper for Stephenson's rank test
library(RItools) # Balance tables

# ggplot global options
theme_set(theme_bw(base_size = 20))

#### Data ####
load(here("attn_rep.RData"))

cali = dle

remove(dle)

#### Explore ####
table(cali$listB, cali$trt_A, useNA = "ifany") # one dude did not respond

#### Table 1 ####
# Show respondents per condition
design_tab = with(cali, table(Treatment = experiment, Placement = sensitive))

colnames(design_tab) = c("List A", "List B")

rownames(design_tab) = c("Organization X", "Organization Y")

design_tab

#### Figure 1 ####
# Compare list experiment estimates
# Each combination implies an independent conventional (aka single) list experiment
#The idea is that different combinations may lead to different prevalence estimates

# List A estimates
a = cali %>% 
  split(.$experiment) %>% 
  map(~difference_in_means(listA ~ trt_A, data = .)) %>% 
  map(tidy) %>% 
  bind_rows(.id = "experiment")

# List B estimates
b = cali %>% 
  split(.$experiment) %>% 
  map(~difference_in_means(listB ~ trt_B, data = .)) %>% 
  map(tidy) %>% 
  bind_rows(.id = "experiment")

# DLE estimate
pool = cali %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols = c(listA, listB)) %>% 
  rename(list = name, count = value) %>% 
  mutate(Z = ifelse(trt_A == 1 & list == "listA" | trt_B == 1 & list == "listB",
                    1, 0)) %>% 
  split(.$experiment) %>% 
  map(~ lm_robust(count ~ Z + list, clusters = id, se_type = "stata", data = .)) %>% 
  map(tidy) %>% 
  bind_rows(.id = "experiment") %>% 
  filter(term == "Z")

# Combine
est_df = rbind(a, b, pool)

# Visualize
## Reorder terms
est_df$term = fct_relevel(est_df$term, "trt_A", "trt_B", "Z")

## Experiment labels
est_df = est_df %>% 
  mutate(experiment = ifelse(experiment == "X", "Organization X", "Organization Y"))

## Plot
attn_plot = ggplot(est_df) +
  aes(x = term, y = estimate, shape = term) +
  geom_hline(yintercept = 0, linetype = "dashed") + 
  geom_point(size = 4, position = position_dodge(width = 0.5)) +
  geom_linerange(aes(x = term, ymin = conf.low, ymax = conf.high), 
                 size = 1, position = position_dodge(width = 0.5)) +
  facet_grid(~ experiment) +
  theme(legend.position = "none") +
  labs(x = "Estimator", y = "Estimate") +
  scale_x_discrete(labels = c("List A", "List B", "Pooled"))

attn_plot

### Tables 4 and D3 ####
# Apply tests to Alvarez et al (2019)

# Difference in differences
dd = cali %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols = c(listA, listB)) %>% 
  rename(list = name, count = value) %>% 
  mutate(Z = ifelse(trt_A == 1 & list == "listA" | trt_B == 1 & list == "listB",
                    1, 0)) %>% 
  split(.$experiment) %>% 
  map(~ lm_robust(count ~ Z * list, clusters = id, data = .)) %>% 
  map(tidy) %>% 
  bind_rows(.id = "experiment") %>% 
  filter(term == "Z:listlistB") %>% 
  mutate(m = NA) %>% 
  select(experiment, statistic = estimate, m, p.value) 

# need to reverse the sign of the statistic  
dd$statistic = dd$statistic * -1

# Function for Stephenson's signed rank test
## This will only work with the current data
steph_test = function(m){
  out = cali %>% 
    group_by(experiment) %>% 
    mutate(diff = listA - listB,
           sgn_diff = (trt_A - trt_B) * diff,
           rank = stephenson_rank(abs(diff), subset_size = m),
           sgn_rank = ifelse(sgn_diff > 0, rank, rank*-1)) %>%
    summarize(statistic = sum(sgn_rank, na.rm = TRUE),
              m = m,
              p.value = pvalue(stephenson_test(diff ~ trt_A,
                                               data = .,
                                               subset_size = m,
                                               alternative = "less")))
  
  return(out)
}


# Now apply to an arbitrary number of values
subsets = c(2, 5, 10, 50)

steph_tab = rbindlist(lapply(subsets, steph_test))

# Combine
attn_tests = rbind(dd, steph_tab)

attn_tests

#### Figure D2 ####
# Mean outcomes per experimental condition

# Calculate means
avg = cali %>% 
  group_by(experiment, trt_A) %>% 
  summarize(listA = mean(listA),
            listB = mean(listB, na.rm = TRUE)) %>% 
  pivot_longer(c(listA, listB), names_to = "list", values_to = "count") %>% 
  mutate(Z = ifelse(list == "listA", trt_A, 1-trt_A),
         experiment = ifelse(experiment == "X",
                             "Organization X",
                             "Organization Y"))

# Visualize
means = ggplot(avg) +
  aes(x = list, y = count, fill = as.factor(Z)) +
  geom_col(position = "dodge") +
  facet_grid(~ experiment) +
  scale_fill_grey(name = "Sensitive item",
                  labels = c("No", "Yes")) +
  scale_x_discrete(labels = c("A", "B")) +
  labs(x = "List",
       y = "Mean response") +
  theme(legend.position = "bottom")

means

#### Figure D2 ####
# Responses to control condition across baseline lists

# need to pivot data to subset to control responses only
controls = cali %>% 
  pivot_longer(cols = c(listA, listB),
               names_to = "list",
               values_to = "count") %>% 
  mutate(z = ifelse(list == "listA" & sensitive == "A" |
                      list == "listB" & sensitive == "B", 1, 0),
         experiment = ifelse(experiment == "X", 
                             "Organization X", "Organization Y"),
         list = ifelse(list == "listA", "List A", "List B")) %>% 
  filter(z == 0) %>% 
  select(experiment, list, count)

# Visualize
control_plot = ggplot(controls) +
  aes(x = count) +
  labs(x = "Response",
       y = "Count") +
  geom_bar() +
  facet_nested(list ~ experiment)

control_plot

#### Table D1 ####
# Comparing means across sensitive items

# Recode treatment and subset to covariates
bal_df = cali %>% 
  mutate(experiment = ifelse(experiment == "Y", 1, 0)) %>% 
  select(experiment, trt_B, female, age, f.educ, f.region)

# Calculate balance
bal_exp = xBalance(experiment ~ .,
                   data = bal_df,
                   na.rm = TRUE, 
                   report = c("adj.means", "adj.mean.diffs", "std.diffs", 
                              "p.values", "chisquare.test"))

# Visualize
bal_exp_tab = data.frame(bal_exp$results)

colnames(bal_exp_tab) = c("Experiment X", "Experiment Y", "Adj. diff.", "Std. diff.", "p-value")

rownames(bal_exp_tab) = c("List B treatment", "Female", "Age","No high school", "High school", 
                          "Some college", "College", "Post-graduate", 
                          "Bay Area", "SoCal (non-LA)", "Los Angeles", "Central/Southern",
                          "North/Mountain", "Central Valley")

bal_exp_tab

#### Table D2 ####
# Comparing means across treatment schedules

# Calculate balance
bal_sen = xBalance(trt_B ~ .,
                   data = bal_df,
                   na.rm = TRUE, 
                   report = c("adj.means", "adj.mean.diffs", "std.diffs", 
                              "p.values", "chisquare.test"))

# Visualize
bal_sen_tab = data.frame(bal_sen$results)

colnames(bal_sen_tab) = c("List A", "List B", "Adj. diff.", "Std. diff.", "p-value")

rownames(bal_sen_tab) = c("Experiment Y", "Female", "Age","No high school", "High school", 
                          "Some college", "College", "Post-graduate", 
                          "Bay Area", "SoCal (non-LA)", "Los Angeles", "Central/Southern",
                          "North/Mountain", "Central Valley")

bal_sen_tab
